import numpy as np
import pandas as pd
import re
df = pd.read_excel('最新发布的北京二手房数据.xlsx')
pd.set_option('display.unicode.east_asian_width', True)

def dealYear(year):
    num = year
    if type(year) == str:
        num = 2022 - int(year)
    return num
def dealType(ser):
    data = np.zeros((len(ser),), dtype='int')
    df = pd.DataFrame({'室': data, '厅': data})
    for i in ser.index:
        if ser[i] != '车位':
            rec = re.findall(r'\d+', ser[i])
            df.loc[i, '室'] = int(rec[0])
            df.loc[i, '厅'] = int(rec[1])
    return df
df['户型'] = df['户型'].str.replace('房间', '室')
df = df.join(dealType(df['户型']))
df['年份'] = df['年份'].str.replace('年建', '').apply(lambda x: dealYear(x))
df['面积'] = df['面积'].str.replace('平米', '').astype('float')
df['总价'] = df['总价'].str.replace('万', '').astype('float')
df['单价'] = df['单价'].str.replace(',', '').str.replace('元/平', '').astype('float')
df = df.rename({'面积': '面积（平方米）', '年份': '房龄', '总价': '总价（万元）', '单价': '单价（元/平方米）'}, axis='columns')
print(df[['面积（平方米）', '房龄', '总价（万元）', '单价（元/平方米）', '室', '厅']])
df1 = df[df['户型'] == '车位']
print('包含车位的行: \n',df1)
print('删除户型异常值前数据的行数:',len(df))
df = df.drop(df1.index)
print('删除户型异常值后数据的行数:',len(df))
df2 = df['房龄'][(df['房龄'] <0) | (df['房龄']>50)]
print('房龄小于0或大于50的行: \n',len(df))
df = df.drop(df2.index)
print('删除房龄异常值后数据的行数:',len(df))
df3 = df.duplicated(keep=False)
print('所有列重复的行: \n',df[df3 == True])
print('删除重复值前数据的行数:',len(df))
df = df.drop_duplicates()
print('删除重复值后数据的行数:',len(df))
print('删除房龄缺失值前数据的行数:',len(df))
df = df.dropna(subset='房龄')
print('删除房龄缺失值后数据的行数:',len(df))
df = df.fillna({'房源标签':'不近地铁'})
print('房源标签替换缺失值后的数据: \n',df.iloc[:,-10:])
bins=[1,60,90,120,150,180,210,520]
area_label=['60平方米以下','60~90平方米','90~120平方米','120~150平方米','150~180平方米','180~120平方米','210平方米以上']
df['面积区间'] = pd.cut(list(df['面积（平方米）']),bins,labels=area_label)
bins = [1,200,400,600,800,1000,2000,4500]
totalPrice_label = ['200万元以下','200万~400万','400万~600万','600万~800万','800万~1000万','1000万~2000万','2000万元以上']
df['总价区间'] = pd.cut(list(df['总价（万元）']),bins,labels=totalPrice_label)
print(df.iloc[:,-5:])
df =df.reset_indesx(drop=True)
df = df.join(pd.get_dummies(df['所在区']))
df = df.join(pd.get_dummies(df['装修']))
df = df.drop(df[df['结构'] == '暂无数据'].index)
df = df.join(pd.get_dummies(df['结构']))
df = df.jpin(pd.get_dummies(df['房源标签']))
def get